library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggpubr)
## Loading required package: ggplot2
library(Biostrings)
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.0.5
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
## lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:data.table':
##
## first, second
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:data.table':
##
## shift
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.4 ✔ stringr 1.4.0
## ✔ readr 2.0.1 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
## ✖ dplyr::between() masks data.table::between()
## ✖ Biostrings::collapse() masks IRanges::collapse(), dplyr::collapse()
## ✖ BiocGenerics::combine() masks dplyr::combine()
## ✖ purrr::compact() masks XVector::compact()
## ✖ IRanges::desc() masks dplyr::desc()
## ✖ S4Vectors::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ S4Vectors::first() masks dplyr::first(), data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::reduce() masks IRanges::reduce()
## ✖ S4Vectors::rename() masks dplyr::rename()
## ✖ XVector::slice() masks IRanges::slice(), dplyr::slice()
## ✖ purrr::transpose() masks data.table::transpose()
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(stringdist)
##
## Attaching package: 'stringdist'
## The following object is masked from 'package:tidyr':
##
## extract
library(doParallel)
## Loading required package: iterators
FullDataset=readRDS( "IEDB_VIPR_MIRA_COV2_DATASET.rds")
# filter only class I by length <15
FullDataset=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length < 15)
WUHAN_COV2_Ep_Coverage=readRDS(file="WUHAN_COV2_Ep_Coverage.rds")
WUHAN_COV2_Ep_Coverage %>% select(Peptide) %>% distinct %>% mutate(Length = nchar(Peptide)) %>% group_by(Length) %>% dplyr::summarise(n=n())%>%arrange(desc(n))%>% mutate(Length = as.character(Length))%>%
mutate(Freq=n/1380)%>% DT::datatable()
WUHAN_LENGTHS_PLT=WUHAN_COV2_Ep_Coverage %>% select(Peptide) %>% distinct %>% mutate(Length = nchar(Peptide)) %>% group_by(Length) %>% dplyr::summarise(n=n())%>%arrange(desc(n))%>% mutate(Length = as.character(Length))%>%
ggbarplot(x="Length",y="n",fill="darkgrey")+ylab("# SARS-CoV-2 Peptides\nto Wuhan-Hu-1")+theme_pubr(base_size = 18)+coord_flip()
ORIGIN_PROTEIN_NUM_PLT=WUHAN_COV2_Ep_Coverage %>% select(Peptide, Protein) %>% distinct() %>% group_by(Protein) %>% dplyr::summarise(n=n())%>% arrange(desc(n)) %>%
ggbarplot(x="Protein",y="n",fill="darkgrey")+ylab("# SARS-CoV-2 Peptides\nto Wuhan-Hu-1")+theme_pubr(base_size = 18)+coord_flip()
CORONAVIRUSSEQS = fread("DATA/Coronavirus_sequences_Full_DT.txt")%>% filter(Virus %in% 'SARS-CoV-2-Wuhan')
CORONAVIRUSSEQS=CORONAVIRUSSEQS %>% mutate(Length = nchar(`Seq.V1`)) %>% select(!c(Virus, `Seq.V1`))
ORIGIN_PROTEIN_FREQ_PLT=WUHAN_COV2_Ep_Coverage %>% select(Peptide, Protein) %>% distinct() %>% group_by(Protein) %>% dplyr::summarise(n=n())%>% inner_join(CORONAVIRUSSEQS) %>% mutate(FreqPerAA = n/Length)%>% arrange(desc(FreqPerAA))%>%
ggbarplot(x="Protein",y="FreqPerAA",fill="darkgrey")+ylab("Freq SARS-CoV-2 Peptides\nper Length Protein to Wuhan-Hu-1")+theme_pubr(base_size = 18)+coord_flip()
## Joining, by = "Protein"
WUHAN_PEPTIDES_BINDING_SCORES=readRDS("ORIGINAL_PEPTIDES_BINDING_SCORES_V2.rds")
WUHAN_HLAS_PLT=WUHAN_PEPTIDES_BINDING_SCORES %>% select(Peptide,Predicted_Binding, Binder)%>% distinct() %>% separate_rows_(cols=c("Predicted_Binding","Binder"),sep=",")%>% select(!Peptide) %>%
group_by(Predicted_Binding, Binder)%>% dplyr::summarise(n=n())%>% ungroup %>%tidyr::complete(Predicted_Binding,Binder, fill = list(n=0))%>% filter(Binder == "BINDER") %>% arrange(desc(n))%>%
mutate(Predicted_Binding = gsub("HLA-","",Predicted_Binding))%>%
ggbarplot(x="Predicted_Binding",y="n",fill="darkgrey")+theme_pubr(base_size = 18)+ylab("Num. SARS-CoV-2 Peptides\n predicted to bind")+xlab("HLA Allele")+rotate_x_text()
## `summarise()` has grouped output by 'Predicted_Binding'. You can override using the `.groups` argument.
MUTATIONS=readRDS("OMICRON_EPITOPE_MUTATIONS.rds")
MUTATIONS_BA1 = readRDS("OMICRON_EPITOPE_MUTATIONS.rds")%>% mutate(Variant = "BA1")
MUTATIONS_SUBVAR=readRDS("OMICRON_EPITOPE_MUTATIONS_SUBVAR.rds")
MUTATIONS_FULL=MUTATIONS_BA1 %>% rbind(MUTATIONS_SUBVAR)%>% distinct()
NUM_MUT_PLT=MUTATIONS_FULL%>% distinct() %>% group_by(Variant)%>% dplyr::summarise(n=n())%>% mutate(Freq=n/1380)%>%
arrange(desc(n))%>% ggplot(aes(x=reorder(Variant,-n),y=n))+geom_bar(stat="identity",fill="#999999",color="black")+ylab("Number of Peptides")+theme_pubr(base_size = 18)+xlab("")+
geom_text(aes(label=round(Freq,digits=3)), nudge_y = 3.5, size=5)
MUTATIONS_FULL%>% distinct() %>% group_by(Variant)%>% dplyr::summarise(n=n())%>% mutate(Freq=n/1380)%>%
arrange(desc(n)) %>% DT::datatable()
NUM_MUT_PUM_PEPS_PLT=MUTATIONS_FULL%>%select(Peptide,MutationPos,Variant) %>% separate_rows_(cols = "MutationPos",sep=",")%>% group_by(Peptide,Variant)%>% dplyr::summarise(NMUT = n_distinct(MutationPos))%>%ungroup %>%
select(NMUT,Variant)%>% group_by(NMUT,Variant) %>% dplyr::summarise(n=n())%>%mutate(Freq=n/sum(n)) %>% ggbarplot(x="NMUT",y="n",fill="darkgrey")+facet_grid(~Variant)+theme_pubr(base_size = 18)+ylab("Number of Peptides")+xlab("Number of Mutations")
## `summarise()` has grouped output by 'Peptide'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'NMUT'. You can override using the `.groups` argument.
MUTATIONS_FULL%>%select(Peptide,MutationPos,Variant) %>% separate_rows_(cols = "MutationPos",sep=",")%>% group_by(Peptide,Variant)%>% dplyr::summarise(NMUT = n_distinct(MutationPos))%>%ungroup %>%
select(NMUT,Variant)%>% group_by(NMUT,Variant) %>% dplyr::summarise(n=n())%>%
dplyr::rename("# Mutations"=NMUT, "# Obs" = n)%>%
readr::write_csv(file="/Users/paulbuckley/Library/CloudStorage/OneDrive-SharedLibraries-Nexus365/WIMM CCB - Koohy Group - Documents/Koohy Group/Effect_Mutation_Tcells/V9/WORKING_VERSION/Main_Tables/Table1.csv")
## `summarise()` has grouped output by 'Peptide'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'NMUT'. You can override using the `.groups` argument.
MUTATIONS_FULL[MUTATIONS_FULL$Protein =="ORF1ab polyprotein",]$Protein = "ORF1ab polyprotein/ORF1a polyprotein"
PROTEIN_MUT_DATA=MUTATIONS_FULL %>% select(Peptide,VariantAlignment,Protein,MutationPos,Variant) %>% distinct() %>% group_by(Protein,Variant)%>% dplyr::summarise(n=n())%>% mutate(Freq=n/sum(n))%>%arrange(desc(n))
## `summarise()` has grouped output by 'Protein'. You can override using the `.groups` argument.
PROTEIN_MUT_DATA=PROTEIN_MUT_DATA %>% mutate(Protein=gsub(" |glycoprotein|phosphoprotein|polyprotein|protein","",Protein))%>% ungroup()
NUM_PROT_PLT=PROTEIN_MUT_DATA%>%
ggbarplot(x="Protein",y="n",fill="darkgrey")+facet_grid(~Variant)+theme_pubr(base_size = 18)+ylab("Number of Peptides")+coord_flip()
AA_CHANGES_PLT=MUTATIONS_FULL %>% separate_rows_(cols = c("MutationPos", "Mutation"),sep=",") %>%mutate(MutationPos=as.numeric(MutationPos))%>% mutate(SeqMutationPos = ((MutationPos-start_pos)+1) )%>%
mutate(ChangeFrom = stringr::str_sub(Mutation, 1,1))%>% mutate(ChangeTo = stringr::str_sub(Mutation, -1,-1))%>% mutate(Length = nchar(Peptide))%>%
mutate(SeqChange = paste0(ChangeFrom,"_",ChangeTo))%>%group_by(SeqChange, SeqMutationPos, Length, Variant)%>% dplyr::summarise(n=n())%>%filter(Length %in% c(9,10))%>%
mutate(SeqMutationPos_chr = as.character(SeqMutationPos))%>% filter(Length == 9)%>%mutate(Length = paste0("Length: ",Length))%>%
mutate(PROLINE = grepl("P_",SeqChange))%>%
ggplot(aes(y=SeqChange,x=reorder(SeqMutationPos_chr, SeqMutationPos), color=PROLINE))+geom_point(aes(size=3))+facet_wrap(~Length+Variant,scales="free",ncol=4)+theme_pubr(base_size = 18)+
ylab("Amino Acid Change")+xlab("Epitope Position")+grids()+
theme(axis.text.y=element_text(hjust=0.1,vjust=0.1))+theme(legend.position = "none")+scale_color_manual(values = c("#666666","#e34a33"))
## `summarise()` has grouped output by 'SeqChange', 'SeqMutationPos', 'Length'. You can override using the `.groups` argument.
SUPP_AA_CHANGES_PLT=MUTATIONS_FULL %>% separate_rows_(cols = c("MutationPos", "Mutation"),sep=",") %>%mutate(MutationPos=as.numeric(MutationPos))%>% mutate(SeqMutationPos = ((MutationPos-start_pos)+1) )%>%
mutate(ChangeFrom = stringr::str_sub(Mutation, 1,1))%>% mutate(ChangeTo = stringr::str_sub(Mutation, -1,-1))%>% mutate(Length = nchar(Peptide))%>%
mutate(SeqChange = paste0(ChangeFrom,"_",ChangeTo))%>%group_by(SeqChange, SeqMutationPos, Length, Variant)%>% dplyr::summarise(n=n())%>%filter(Length %in% c(9,10))%>%
mutate(SeqMutationPos_chr = as.character(SeqMutationPos))%>% filter(Length == 10)%>%mutate(Length = paste0("Length: ",Length))%>%
mutate(PROLINE = grepl("P_",SeqChange))%>%
ggplot(aes(y=SeqChange,x=reorder(SeqMutationPos_chr, SeqMutationPos), color=PROLINE))+geom_point(aes(size=3))+facet_wrap(~Length+Variant,scales="free",ncol=4)+theme_pubr(base_size = 18)+
ylab("Amino Acid Change")+xlab("Epitope Position")+grids()+
theme(axis.text.y=element_text(hjust=0.1,vjust=0.1))+theme(legend.position = "none")+scale_color_manual(values = c("#666666","#e34a33"))
## `summarise()` has grouped output by 'SeqChange', 'SeqMutationPos', 'Length'. You can override using the `.groups` argument.
A_C_GRID=cowplot::plot_grid(WUHAN_LENGTHS_PLT, ORIGIN_PROTEIN_NUM_PLT, ORIGIN_PROTEIN_FREQ_PLT,nrow=1,rel_widths = c(0.6,1.2,1.2), align="h")
A_C_GRID
{,dpi=300, fig.height = 6, fig.width = 20} D_E_F_GRID=cowplot::plot_grid(NUM_MUT_PLT,Freq_MUT_PLT, NUM_MUT_PUM_PEPS_PLT,NUM_PROT_PLT,nrow=1,rel_widths = c(0.75,0.75,1.0,1.5), align="h") D_E_F_GRID
D_E_F_GRID=cowplot::plot_grid(NUM_MUT_PLT, NUM_MUT_PUM_PEPS_PLT,NUM_PROT_PLT,nrow=1,rel_widths = c(0.55,0.8,1.65), align="h")
## Warning: Graphs cannot be horizontally aligned unless the axis parameter is set.
## Placing graphs unaligned.
D_E_F_GRID
G_GRID=cowplot::plot_grid(AA_CHANGES_PLT, align="h")
G_GRID
HLA_GRID = cowplot::plot_grid(WUHAN_HLAS_PLT, nrow=1)
#A_F_GRID=cowplot::plot_grid(A_C_GRID,D_E_F_GRID, align="hv", axis="bl",nrow=2)
cowplot::plot_grid(D_E_F_GRID, G_GRID, nrow=2, align="v",axis="l", rel_heights = c(0.6,1.4))
cowplot::plot_grid(A_C_GRID,SUPP_AA_CHANGES_PLT,nrow=3, rel_heights = c(0.8,0.8,1.4))